pacman::p_load(foreign, dplyr, tidyr, tibble, xtable,reshape,questionr)

sum.na <- function(x){
  sum(x,na.rm=T)
}
precinct.num <- function(df) {
  vec1<-c()
  vec2<-c()
  frac <- df%>%
    dplyr::select(is.numeric)
  for (z in 1:ncol(frac)){
    vec2<-append(as.numeric(apply(frac[,z]%>%
                                    as.data.frame(.)%>%
                                    na.omit(.),2,length)),vec2)
  }
  for (i in 1:(ncol(frac)-1)){
    for (j in (i+1):ncol(frac)){
      vec1<-append((sum(complete.cases(frac[,i], frac[,j]))),vec1)
      print(sum(complete.cases(frac[,i], frac[,j])))
    }
  }
  return(list(max.complete=max(vec1),
              min.complete=min(vec1),
              min.obs=min(vec2),
              max.obs=max(vec2)))
}#extract number of 1) complete observations across candidates and 2) number of non-NA values for each candidate

setwd("")#set working directory

#2008 Dem (Pres.)----

dta <- read.csv("Unified Even Years/ER 2008 Primary/prcres.txt",header=F)

colnames(dta)[3] <- "county.code"
colnames(dta)[4] <- "precinct.num"
colnames(dta)[6] <- "district"
colnames(dta)[9] <- "office.code"
colnames(dta)[10] <- "party.code"
colnames(dta)[12] <- "last.name"
colnames(dta)[13] <- "first.name"
colnames(dta)[16] <- "votes"

#create unique identifier
dta$unique.precinct <- as.numeric(dta$county.code)*10000000+as.numeric(dta$precinct.num)
freq(dta$unique.precinct)
#subset office
dta.pres.08 <- dta[dta$office.code=="USP",]
#subset party 
dta.pres.08.dem <- dta.pres.08[dta.pres.08$party.code=="DEM",]
dta.pres.dem.08.sub <- subset(dta.pres.08.dem,select=c("unique.precinct","last.name","votes"))
dta.pres.dem.08.sub.wide <- reshape(dta.pres.dem.08.sub,idvar="unique.precinct",timevar="last.name",direction="wide")
dta.pres.dem.08.sub.wide$total.votes <- apply(dta.pres.dem.08.sub.wide[,-1],1,sum.na)
dta.pres.dem.08.sub.wide$clinton.08.pct <- dta.pres.dem.08.sub.wide$votes.CLINTON/dta.pres.dem.08.sub.wide$total.votes
dta.pres.dem.08.sub.wide$obama.08.pct <- dta.pres.dem.08.sub.wide$votes.OBAMA/dta.pres.dem.08.sub.wide$total.votes

#voteshare totals 
dta.pres.dem.08.voteshare <- dta.pres.dem.08.sub %>%
  group_by(last.name) %>%
  dplyr::summarise(voteshare = sum(as.numeric(votes)))%>%
  mutate(year=2008)
dta.pres.dem.08.voteshare$voteshare <- dta.pres.dem.08.voteshare$voteshare / sum(dta.pres.dem.08.voteshare$voteshare)

#2008 GOP (Pres.)-----
#subset party 
dta.pres.08.gop <- dta.pres.08[dta.pres.08$party.code=="REP",]
dta.pres.gop.08.sub <- subset(dta.pres.08.gop,select=c("unique.precinct","last.name","votes"))
dta.pres.gop.08.sub.wide <- reshape(dta.pres.gop.08.sub,idvar="unique.precinct",timevar="last.name",direction="wide")
dta.pres.gop.08.sub.wide$total.votes <- apply(dta.pres.gop.08.sub.wide[,-1],1,sum.na)
dta.pres.gop.08.sub.wide$mccain.08.pct <- dta.pres.gop.08.sub.wide$votes.MCCAIN/dta.pres.gop.08.sub.wide$total.votes
dta.pres.gop.08.sub.wide$huckabee.08.pct <- dta.pres.gop.08.sub.wide$votes.HUCKABEE/dta.pres.gop.08.sub.wide$total.votes
dta.pres.gop.08.sub.wide$paul.08.pct <- dta.pres.gop.08.sub.wide$votes.PAUL/dta.pres.gop.08.sub.wide$total.votes

#voteshare totals 
dta.pres.gop.08.voteshare <- dta.pres.gop.08.sub %>%
  group_by(last.name) %>%
  dplyr::summarise(voteshare = sum(as.numeric(votes)))%>%
  mutate(year=2008)
dta.pres.gop.08.voteshare$voteshare <- dta.pres.gop.08.voteshare$voteshare / sum(dta.pres.gop.08.voteshare$voteshare)

#2010 Dem (Senate) ------ 
dta <- read.csv("Unified Even Years/ER 2010 Primary/2010_senate_primary_precinctreturns.csv",header=F) %>% 
  slice(-1)#first row duplicates colnames

colnames(dta)[3] <- "county.code"
colnames(dta)[4] <- "precinct.num"
colnames(dta)[6] <- "district"
colnames(dta)[9] <- "office.code"
colnames(dta)[10] <- "party.code"
colnames(dta)[12] <- "last.name"
colnames(dta)[13] <- "first.name"
colnames(dta)[16] <- "votes"
#create unique identifier
dta$unique.precinct <- as.numeric(dta$county.code)*10000000+as.numeric(dta$precinct.num)
#subset office
dta.sen.10 <- dta[dta$office.code=="USS",]
#subset party
dta.sen.dem.10 <- dta.sen.10[dta.sen.10$party.code=="DEM",]
dta.sen.dem.10.sub <- subset(dta.sen.dem.10,select=c("unique.precinct","last.name","votes"))
dta.sen.dem.10.sub.wide <- reshape(dta.sen.dem.10.sub,idvar="unique.precinct",timevar="last.name",direction="wide")
#convert vote columns to numeric
dta.sen.dem.10.sub.wide$votes.SESTAK<-as.numeric(dta.sen.dem.10.sub.wide$votes.SESTAK)
dta.sen.dem.10.sub.wide$votes.SPECTER<-as.numeric(dta.sen.dem.10.sub.wide$votes.SPECTER)
dta.sen.dem.10.sub.wide$`votes.WRITE-IN`<-as.numeric(dta.sen.dem.10.sub.wide$`votes.WRITE-IN`)
#extract vote shares
dta.sen.dem.10.sub.wide$total.votes <- apply(dta.sen.dem.10.sub.wide[,-1],1,sum.na)
dta.sen.dem.10.sub.wide$sestak.10.pct <- dta.sen.dem.10.sub.wide$votes.SESTAK/dta.sen.dem.10.sub.wide$total.votes
dta.sen.dem.10.sub.wide$specter.10.pct <- dta.sen.dem.10.sub.wide$votes.SPECTER/dta.sen.dem.10.sub.wide$total.votes

#voteshare totals 
dta.sen.dem.10.voteshare <- dta.sen.dem.10.sub %>%
  group_by(last.name) %>%
  dplyr::summarise(voteshare = sum(as.numeric(votes)))%>%
  mutate(year=2010)
dta.sen.dem.10.voteshare$voteshare <- dta.sen.dem.10.voteshare$voteshare / sum(dta.sen.dem.10.voteshare$voteshare)


#2012 GOP (Pres.) ---- 
dta <- read.csv("Unified Even Years/ER 2012 Primary/ERStat.txt",header=F)
colnames(dta)[3] <- "county.code"
colnames(dta)[4] <- "precinct.num"
colnames(dta)[6] <- "district"
colnames(dta)[9] <- "office.code"
colnames(dta)[10] <- "party.code"
colnames(dta)[12] <- "last.name"
colnames(dta)[13] <- "first.name"
colnames(dta)[16] <- "votes"
#create unique identifier
dta$unique.precinct <- dta$county.code*10000000+dta$precinct.num
#subset office
dta.pres.12 <- dta[dta$office.code=="USP",]
dta.pres.12.gop <- dta.pres.12[dta.pres.12$party.code=="REP",]
dta.pres.gop.12.sub <- subset(dta.pres.12.gop,select=c("unique.precinct","last.name","votes"))
dta.pres.gop.12.sub.wide <- reshape(dta.pres.gop.12.sub,idvar="unique.precinct",timevar="last.name",direction="wide")
dta.pres.gop.12.sub.wide$total.votes <- apply(dta.pres.gop.12.sub.wide[,-1],1,sum.na)
dta.pres.gop.12.sub.wide$romney.12.pct <- dta.pres.gop.12.sub.wide$votes.ROMNEY/dta.pres.gop.12.sub.wide$total.votes
dta.pres.gop.12.sub.wide$gingrich.12.pct <- dta.pres.gop.12.sub.wide$votes.GINGRICH/dta.pres.gop.12.sub.wide$total.votes
dta.pres.gop.12.sub.wide$paul.12.pct <- dta.pres.gop.12.sub.wide$votes.PAUL/dta.pres.gop.12.sub.wide$total.votes
dta.pres.gop.12.sub.wide$santorum.12.pct <- dta.pres.gop.12.sub.wide$votes.SANTORUM/dta.pres.gop.12.sub.wide$total.votes

#voteshare totals 
dta.pres.gop.12.voteshare <- dta.pres.gop.12.sub %>%
  group_by(last.name) %>%
  dplyr::summarise(voteshare = sum(as.numeric(votes)))%>%
  mutate(year=2012)
dta.pres.gop.12.voteshare$voteshare <- dta.pres.gop.12.voteshare$voteshare / sum(dta.pres.gop.12.voteshare$voteshare)

#2012 GOP (Senate)-----
dta.sen.12 <- dta[dta$office.code=="USS",]
dta.sen.12.gop <- dta.sen.12[dta.sen.12$party.code=="REP",]
dta.sen.gop.12.sub <- subset(dta.sen.12.gop,select=c("unique.precinct","last.name","votes"))
dta.sen.gop.12.sub.wide <- reshape(dta.sen.gop.12.sub,idvar="unique.precinct",timevar="last.name",direction="wide")
dta.sen.gop.12.sub.wide$total.votes <- apply(dta.sen.gop.12.sub.wide[,-1],1,sum.na)
dta.sen.gop.12.sub.wide$smith.12.pct <- dta.sen.gop.12.sub.wide$votes.SMITH/dta.sen.gop.12.sub.wide$total.votes
dta.sen.gop.12.sub.wide$welch.12.pct <- dta.sen.gop.12.sub.wide$votes.WELCH/dta.sen.gop.12.sub.wide$total.votes
dta.sen.gop.12.sub.wide$rohrer.12.pct <- dta.sen.gop.12.sub.wide$votes.ROHRER/dta.sen.gop.12.sub.wide$total.votes
dta.sen.gop.12.sub.wide$christian.12.pct <- dta.sen.gop.12.sub.wide$votes.CHRISTIAN/dta.sen.gop.12.sub.wide$total.votes

#voteshare totals 
dta.sen.gop.12.voteshare <- dta.sen.gop.12.sub %>%
  group_by(last.name) %>%
  dplyr::summarise(voteshare = sum(as.numeric(votes)))%>%
  mutate(year=2012)
dta.sen.gop.12.voteshare$voteshare <- dta.sen.gop.12.voteshare$voteshare / sum(dta.sen.gop.12.voteshare$voteshare)

#2016 GOP (Pres.) ----- 
dta <- read.csv("PA-precincts-2020/Unified Even Years/ER 2016 Primary/ERStat_2016_P(528159)_20160728.txt",header=F)
colnames(dta)[3] <- "county.code"
colnames(dta)[4] <- "precinct.num"
colnames(dta)[6] <- "district"
colnames(dta)[9] <- "office.code"
colnames(dta)[10] <- "party.code"
colnames(dta)[12] <- "last.name"
colnames(dta)[13] <- "first.name"
colnames(dta)[16] <- "votes"

#create unique identifier
dta$unique.precinct <- dta$county.code*10000000+dta$precinct.num
#subset office
dta.pres.16 <- dta[dta$office.code=="USP",]
#subset party
dta.pres.gop.16 <- dta.pres.16[dta.pres.16$party.code=="REP",]
dta.pres.gop.16.sub <- subset(dta.pres.gop.16,select=c("unique.precinct","last.name","votes"))
dta.pres.gop.16.sub.wide <- reshape(dta.pres.gop.16.sub,idvar="unique.precinct",timevar="last.name",direction="wide")
dta.pres.gop.16.sub.wide$total.votes <- apply(dta.pres.gop.16.sub.wide[,-1],1,sum.na)
dta.pres.gop.16.sub.wide$trump.16.pct <- dta.pres.gop.16.sub.wide$votes.TRUMP/dta.pres.gop.16.sub.wide$total.votes
dta.pres.gop.16.sub.wide$cruz.16.pct <- dta.pres.gop.16.sub.wide$votes.CRUZ/dta.pres.gop.16.sub.wide$total.votes
dta.pres.gop.16.sub.wide$kasich.16.pct <- dta.pres.gop.16.sub.wide$votes.KASICH/dta.pres.gop.16.sub.wide$total.votes

#voteshare totals 
dta.pres.gop.16.voteshare <- dta.pres.gop.16.sub %>%
  group_by(last.name) %>%
  dplyr::summarise(voteshare = sum(votes))%>%
  mutate(year=2016)
dta.pres.gop.16.voteshare$voteshare <- dta.pres.gop.16.voteshare$voteshare / sum(dta.pres.gop.16.voteshare$voteshare)

#2016 Dem (Pres.)------ 
#subset office
dta.pres.dem.16 <- dta.pres.16[dta.pres.16$party.code=="DEM",]
dta.pres.dem.16.sub <- subset(dta.pres.dem.16,select=c("unique.precinct","last.name","votes"))
dta.pres.dem.16.sub.wide <- reshape(dta.pres.dem.16.sub,idvar="unique.precinct",timevar="last.name",direction="wide")
#extract vote shares
dta.pres.dem.16.sub.wide$total.votes <- apply(dta.pres.dem.16.sub.wide[,-1],1,sum.na)
dta.pres.dem.16.sub.wide$clinton.16.pct <- dta.pres.dem.16.sub.wide$votes.CLINTON/dta.pres.dem.16.sub.wide$total.votes
dta.pres.dem.16.sub.wide$sanders.16.pct <- dta.pres.dem.16.sub.wide$votes.SANDERS/dta.pres.dem.16.sub.wide$total.votes

#voteshare totals
dta.pres.dem.16.voteshare <- dta.pres.dem.16.sub %>%
  group_by(last.name) %>%
  dplyr::summarise(voteshare = sum(as.numeric(votes)))%>%
  mutate(year=2016)
dta.pres.dem.16.voteshare$voteshare <- dta.pres.dem.16.voteshare$voteshare / sum(dta.pres.dem.16.voteshare$voteshare)

#2016 Dem (Senate)----- 
#subset office
dta.sen.16 <- dta[dta$office.code=="USS",]
#subset party
dta.sen.dem.16 <- dta.sen.16[dta.sen.16$party.code=="DEM",]
dta.sen.dem.16.sub <- subset(dta.sen.dem.16,select=c("unique.precinct","last.name","votes"))
dta.sen.dem.16.sub.wide <- reshape(dta.sen.dem.16.sub,idvar="unique.precinct",timevar="last.name",direction="wide")
#convert vote columns to numeric
dta.sen.dem.16.sub.wide$total.votes <- apply(dta.sen.dem.16.sub.wide[,-1],1,sum.na)
dta.sen.dem.16.sub.wide$fetterman.16.pct <- dta.sen.dem.16.sub.wide$votes.FETTERMAN/dta.sen.dem.16.sub.wide$total.votes
dta.sen.dem.16.sub.wide$sestak.16.pct <- dta.sen.dem.16.sub.wide$votes.SESTAK/dta.sen.dem.16.sub.wide$total.votes
dta.sen.dem.16.sub.wide$mcginty.16.pct <- dta.sen.dem.16.sub.wide$votes.MCGINTY/dta.sen.dem.16.sub.wide$total.votes

#voteshare totals
dta.sen.dem.16.voteshare <- dta.sen.dem.16.sub %>%
  group_by(last.name) %>%
  dplyr::summarise(voteshare = sum(as.numeric(votes)))%>%
  mutate(year=2016)
dta.sen.dem.16.voteshare$voteshare <- dta.sen.dem.16.voteshare$voteshare / sum(dta.sen.dem.16.voteshare$voteshare)

#2018 GOP (Senate) ----- 
dta <- read.csv("PA-precincts-2020/Unified Even Years/ER 2018 Primary/ERStat_2018_P(215443)_20180730.txt",header=F)
colnames(dta)[3] <- "county.code"
colnames(dta)[4] <- "precinct.num"
colnames(dta)[6] <- "district"
colnames(dta)[9] <- "office.code"
colnames(dta)[10] <- "party.code"
colnames(dta)[12] <- "last.name"
colnames(dta)[13] <- "first.name"
colnames(dta)[16] <- "votes"

#create unique identifier
dta$unique.precinct <- dta$county.code*10000000+dta$precinct.num
#subset office
dta.sen.18 <- dta[dta$office.code=="USS",]
#subset party
dta.sen.gop.18 <- dta.sen.18[dta.sen.18$party.code=="REP",]
dta.sen.gop.18.sub <- subset(dta.sen.gop.18,select=c("unique.precinct","last.name","votes"))
dta.sen.gop.18.sub.wide <- reshape(dta.sen.gop.18.sub,idvar="unique.precinct",timevar="last.name",direction="wide")
dta.sen.gop.18.sub.wide$total.votes <- apply(dta.sen.gop.18.sub.wide[,-1],1,sum.na)
dta.sen.gop.18.sub.wide$barletta.18.pct <- dta.sen.gop.18.sub.wide$votes.BARLETTA/dta.sen.gop.18.sub.wide$total.votes
dta.sen.gop.18.sub.wide$christiana.18.pct <- dta.sen.gop.18.sub.wide$votes.CHRISTIANA/dta.sen.gop.18.sub.wide$total.votes

#voteshare totals 
dta.sen.gop.18.voteshare <- dta.sen.gop.18.sub %>%
  group_by(last.name) %>%
  dplyr::summarise(voteshare = sum(votes))%>%
  mutate(year=2018)
dta.sen.gop.18.voteshare$voteshare <- dta.sen.gop.18.voteshare$voteshare / sum(dta.sen.gop.18.voteshare$voteshare)

#2020 Dem (Pres.)------

dta <- read.csv("Unified Even Years/ER 2020 Primary/ElectionReturns_2020_Primary_PrecinctReturns.txt",header=F)
colnames(dta)[3] <- "county.code"
colnames(dta)[4] <- "precinct.num"
colnames(dta)[6] <- "district"
colnames(dta)[9] <- "office.code"
colnames(dta)[10] <- "party.code"
colnames(dta)[12] <- "last.name"
colnames(dta)[13] <- "first.name"
colnames(dta)[16] <- "votes"
#create unique identifier
dta$unique.precinct <- dta$county.code*10000000+dta$precinct.num
#subset office
dta.pres.20 <- dta[dta$office.code=="USP",]
#subset party
dta.pres.dem.20 <- dta.pres.20[dta.pres.20$party.code=="DEM",]
dta.pres.dem.20.sub <- subset(dta.pres.dem.20,select=c("unique.precinct","last.name","votes"))
dta.pres.dem.20.sub.wide <- reshape(dta.pres.dem.20.sub,idvar="unique.precinct",timevar="last.name",direction="wide")
dta.pres.dem.20.sub.wide$total.votes <- apply(dta.pres.dem.20.sub.wide[,-1],1,sum.na)
dta.pres.dem.20.sub.wide$sanders.20.pct <- dta.pres.dem.20.sub.wide$votes.SANDERS/dta.pres.dem.20.sub.wide$total.votes
dta.pres.dem.20.sub.wide$biden.20.pct <- dta.pres.dem.20.sub.wide$votes.BIDEN/dta.pres.dem.20.sub.wide$total.votes
dta.pres.dem.20.sub.wide$gabbard.20.pct <- dta.pres.dem.20.sub.wide$votes.GABBARD/dta.pres.dem.20.sub.wide$total.votes

#voteshare totals 
dta.pres.dem.20.voteshare <- dta.pres.dem.20.sub %>%
  group_by(last.name) %>%
  dplyr::summarise(voteshare = sum(as.numeric(votes)))%>%
  mutate(year=2020)
dta.pres.dem.20.voteshare$voteshare <- dta.pres.dem.20.voteshare$voteshare / sum(dta.pres.dem.20.voteshare$voteshare)

# 2022 GOP (Senate)------ 
dta <- read.csv("Unified Even Years/ER 2022 Primary/ElectionReturns_2022_Primary_PrecinctReturns.txt",header=F)
colnames(dta)[3] <- "county.code"
colnames(dta)[4] <- "precinct.num"
colnames(dta)[6] <- "district"
colnames(dta)[9] <- "office.code"
colnames(dta)[10] <- "party.code"
colnames(dta)[12] <- "last.name"
colnames(dta)[13] <- "first.name"
colnames(dta)[16] <- "votes"
#create unique identifier
dta$unique.precinct <- dta$county.code*10000000+dta$precinct.num
#subset office
dta.sen.22 <- dta[dta$office.code=="USS",]
#susbset party
dta.sen.gop.22 <- dta.sen.22[dta.sen.22$party.code=="REP",]
dta.sen.gop.22.sub <- subset(dta.sen.gop.22,select=c("unique.precinct","last.name","votes"))
dta.sen.gop.22.sub.wide <- reshape(dta.sen.gop.22.sub,idvar="unique.precinct",timevar="last.name",direction="wide")
dta.sen.gop.22.sub.wide$total.votes <- apply(dta.sen.gop.22.sub.wide[,-1],1,sum.na)
dta.sen.gop.22.sub.wide$barnette.22.pct <- dta.sen.gop.22.sub.wide$votes.BARNETTE/dta.sen.gop.22.sub.wide$total.votes
dta.sen.gop.22.sub.wide$oz.22.pct <- dta.sen.gop.22.sub.wide$votes.OZ/dta.sen.gop.22.sub.wide$total.votes
dta.sen.gop.22.sub.wide$mccormick.22.pct <- dta.sen.gop.22.sub.wide$votes.MCCORMICK/dta.sen.gop.22.sub.wide$total.votes

#voteshare totals 
dta.sen.gop.22.voteshare <- dta.sen.gop.22.sub %>%
  group_by(last.name) %>%
  dplyr::summarise(voteshare = sum(as.numeric(votes)))%>%
  mutate(year=2022)
dta.sen.gop.22.voteshare$voteshare <- dta.sen.gop.22.voteshare$voteshare / sum(dta.sen.gop.22.voteshare$voteshare)

#2022 Dem (Senate)------
dta.sen.dem.22 <- dta.sen.22[dta.sen.22$party.code=="DEM",]
dta.sen.dem.22.sub <- subset(dta.sen.dem.22,select=c("unique.precinct","last.name","votes"))
dta.sen.dem.22.sub.wide <- reshape(dta.sen.dem.22.sub,idvar="unique.precinct",timevar="last.name",direction="wide")
dta.sen.dem.22.sub.wide$total.votes <- apply(dta.sen.dem.22.sub.wide[,-1],1,sum.na)
dta.sen.dem.22.sub.wide$lamb.22.pct <- dta.sen.dem.22.sub.wide$votes.LAMB/dta.sen.dem.22.sub.wide$total.votes
dta.sen.dem.22.sub.wide$fetterman.22.pct <- dta.sen.dem.22.sub.wide$votes.FETTERMAN/dta.sen.dem.22.sub.wide$total.votes
dta.sen.dem.22.sub.wide$kenyatta.22.pct <- dta.sen.dem.22.sub.wide$votes.KENYATTA/dta.sen.dem.22.sub.wide$total.votes
dta.sen.dem.22.sub.wide
#voteshare totals 
dta.sen.dem.22.voteshare <- dta.sen.dem.22.sub %>%
  group_by(last.name) %>%
  dplyr::summarise(voteshare = sum(as.numeric(votes)))%>%
  mutate(year=2022)
dta.sen.dem.22.voteshare$voteshare <- dta.sen.dem.22.voteshare$voteshare / sum(dta.sen.dem.22.voteshare$voteshare)

# Export vote shares ---- 
pa.voteshares <- rbind(dta.sen.gop.22.voteshare, dta.sen.dem.22.voteshare, #excluding GOP 2012 senate race because it is not competitive by the 25% for the top two candidates benchmark
                       dta.pres.gop.08.voteshare, dta.pres.gop.12.voteshare, dta.pres.gop.16.voteshare,
                       dta.pres.dem.08.voteshare, dta.pres.dem.16.voteshare, dta.sen.dem.16.voteshare, 
                       dta.pres.dem.20.voteshare, dta.sen.gop.18.voteshare, dta.sen.dem.10.voteshare) %>%
  dplyr::rename(cand=last.name)
write.csv(pa.voteshares, "pa-voteshares.csv")
colnames(dta.pres.gop.16.sub.wide)

# Export corrs ---- 
# merge reps
gop.merged <- dta.pres.gop.08.sub.wide %>%
  full_join(dta.pres.gop.12.sub.wide, by="unique.precinct") %>% 
  full_join(dta.pres.gop.16.sub.wide, by="unique.precinct") %>% 
  full_join(dta.sen.gop.18.sub.wide, by="unique.precinct") %>%
  full_join(dta.sen.gop.22.sub.wide, by="unique.precinct") %>%
  dplyr::select(-unique.precinct) %>%
  subset(.,select=c("mccain.08.pct", "huckabee.08.pct", "paul.08.pct" ,
                    "paul.12.pct", "romney.12.pct", "gingrich.12.pct", "santorum.12.pct", 
                    "trump.16.pct","cruz.16.pct","kasich.16.pct",
                    "barletta.18.pct","christiana.18.pct",
                    "barnette.22.pct","oz.22.pct","mccormick.22.pct"))

cormatd <- cor(gop.merged,use="pairwise.complete.obs")
cormatd2 <- cormatd[,-c(1,2,3)]

print(xtable(cormatd2, 
             label="f:pagop", 
             caption=c("Correlations of vote shares in Republican primaries from all Pennsylvania precincts (n=8151-9008)."),
             digits=c(0,rep(3,12))),
      scalebox=0.55, 
      file=c("tableA18.tex"))#table A18

tableA18 <- readLines("tableA18.tex")
tableA18 <- gsub("mccain.08.pct", "McCain '08", tableA18)
tableA18 <- gsub("huckabee.08.pct", "Huckabee '08", tableA18)
tableA18 <- gsub("paul.08.pct", "Paul '08", tableA18)
tableA18 <- gsub("paul.12.pct", "Paul '12", tableA18)
tableA18 <- gsub("romney.12.pct", "Romney '12", tableA18)
tableA18 <- gsub("gingrich.12.pct", "Gingrich '12", tableA18)
tableA18 <- gsub("santorum.12.pct", "Santorum '12", tableA18)
tableA18 <- gsub("trump.16.pct", "Trump '16", tableA18)
tableA18 <- gsub("cruz.16.pct", "Cruz '16", tableA18)
tableA18 <- gsub("kasich.16.pct", "Kasich '16", tableA18)
tableA18 <- gsub("barletta.18.pct", "Barletta '18", tableA18)
tableA18 <- gsub("christiana.18.pct", "Christiana '18", tableA18)
tableA18 <- gsub("barnette.22.pct", "Barnette '22", tableA18)
tableA18 <- gsub("oz.22.pct", "Oz '22", tableA18)
tableA18 <- gsub("mccormick.22.pct", "McCormick '22", tableA18)
writeLines(tableA18, "tableA18.tex")

#preapre corrs for regression 
cormatd3 <- data.frame(cbind(variable_2 = rownames(cormatd2), cormatd2))
cormatd4 <- melt(cormatd3, id = c("variable_2"))

cormatd4$year_cand1_0 <- ifelse(grepl("08.pct", cormatd4$variable_2), 2008, 0)
cormatd4$year_cand1_1 <- ifelse(grepl("10.pct", cormatd4$variable_2), 2010, 0)
cormatd4$year_cand1_2 <- ifelse(grepl("12.pct", cormatd4$variable_2), 2012, 0)
cormatd4$year_cand1_3 <- ifelse(grepl("16.pct", cormatd4$variable_2), 2016, 0)
cormatd4$year_cand1_4 <- ifelse(grepl("18.pct", cormatd4$variable_2), 2018, 0)
cormatd4$year_cand1_5 <- ifelse(grepl("20.pct", cormatd4$variable_2), 2020, 0)
cormatd4$year_cand1_6 <- ifelse(grepl("22.pct", cormatd4$variable_2), 2022, 0)
cormatd4$year_cand1 <- cormatd4$year_cand1_1 + cormatd4$year_cand1_2 + cormatd4$year_cand1_3 + cormatd4$year_cand1_4 + cormatd4$year_cand1_0 + cormatd4$year_cand1_5 + cormatd4$year_cand1_6
cormatd4$year_cand2_0 <- ifelse(grepl("08.pct", cormatd4$variable), 2008, 0)
cormatd4$year_cand2_1 <- ifelse(grepl("10.pct", cormatd4$variable), 2010, 0)
cormatd4$year_cand2_2 <- ifelse(grepl("12.pct", cormatd4$variable), 2012, 0)
cormatd4$year_cand2_3 <- ifelse(grepl("16.pct", cormatd4$variable), 2016, 0)
cormatd4$year_cand2_4 <- ifelse(grepl("18.pct", cormatd4$variable), 2018, 0)
cormatd4$year_cand2_5 <- ifelse(grepl("20.pct", cormatd4$variable), 2020, 0)
cormatd4$year_cand2_6 <- ifelse(grepl("22.pct", cormatd4$variable), 2022, 0)
cormatd4$year_cand2 <- cormatd4$year_cand2_1 + cormatd4$year_cand2_2 + cormatd4$year_cand2_3 + cormatd4$year_cand2_4 + cormatd4$year_cand2_0 + cormatd4$year_cand2_5 + cormatd4$year_cand2_6
cormatd5 <-cormatd4 %>%
  subset(., select=c(variable_2, year_cand1, variable, year_cand2, value))

cormatd5$value <- as.numeric(cormatd5$value)
cormatd5$variable <- as.character(cormatd5$variable)
cormatd5$party <- "GOP"
cormatd5$state <- 'PA'

gop.grouped<-cormatd5%>%
  mutate(year_cand1=as.character(year_cand1))%>%
  mutate(year_cand2=as.character(year_cand2))%>%
  filter(year_cand1!=year_cand2)%>%
  filter(variable_2!=variable)%>%
  group_by(party,year_cand1,year_cand2)%>%
  dplyr::summarise(cor_mean=mean(value,na.rm=T), 
                   cor_median=median(value,na.rm=T))

write.csv(cormatd5, "pa_gop_groupbyyear_corrs.csv")

# merge dems
dem.merged <- dta.pres.dem.08.sub.wide %>%
  full_join(dta.sen.dem.10.sub.wide, by="unique.precinct") %>% 
  full_join(dta.sen.dem.16.sub.wide, by="unique.precinct") %>% 
  full_join(dta.pres.dem.16.sub.wide, by="unique.precinct") %>% 
  full_join(dta.pres.dem.20.sub.wide, by="unique.precinct") %>%
  full_join(dta.sen.dem.22.sub.wide, by="unique.precinct")%>%
  dplyr::select(-unique.precinct) %>%
    subset(.,select=c("clinton.08.pct","obama.08.pct", 
                      "sestak.10.pct", "specter.10.pct", 
                      "clinton.16.pct","sanders.16.pct",
                      "fetterman.16.pct" ,"sestak.16.pct" ,"mcginty.16.pct",
                      "sanders.20.pct", "biden.20.pct",
                      "fetterman.22.pct","lamb.22.pct","kenyatta.22.pct"))
cormatd <- cor(dem.merged,use="pairwise.complete.obs")
cormatd2 <- cormatd[,-c(1,2)]

precinct.num(gop.merged)

print(xtable(cormatd2, 
             label="f:padem", 
             caption=c("Correlations of vote shares in Democratic primaries from all Pennsylvania precincts (n=8932-9265)."),
             digits=c(0,rep(3,12))),
      scalebox=0.55, 
      file=c("tableA19.tex"))#table A19

tableA19 <- readLines("tableA19.tex")
tableA19 <- gsub("clinton.08.pct", "Clinton '08", tableA19)
tableA19 <- gsub("obama.08.pct", "Obama '08", tableA19)
tableA19 <- gsub("sestak.10.pct", "Sestak '10", tableA19)
tableA19 <- gsub("specter.10.pct", "Specter '10", tableA19)
tableA19 <- gsub("clinton.16.pct", "Clinton '16", tableA19)
tableA19 <- gsub("sanders.16.pct", "Sanders '16", tableA19)
tableA19 <- gsub("fetterman.16.pct", "Fetterman '16", tableA19)
tableA19 <- gsub("sestak.16.pct", "Sestak '16", tableA19)
tableA19 <- gsub("mcginty.16.pct", "McGinty '16", tableA19)
tableA19 <- gsub("sanders.20.pct", "Sanders '20", tableA19)
tableA19 <- gsub("biden.20.pct", "Biden '20", tableA19)
tableA19 <- gsub("fetterman.22.pct", "Fetterman '22", tableA19)
tableA19 <- gsub("lamb.22.pct", "Lamb '22", tableA19)
tableA19 <- gsub("kenyatta.22.pct", "Kenyatta '22", tableA19)
writeLines(tableA19, "tableA19.tex")

#preapre corrs for regression 
cormatd3 <- data.frame(cbind(variable_2 = rownames(cormatd2), cormatd2))
cormatd4 <- melt(cormatd3, id = c("variable_2"))

cormatd4$year_cand1_0 <- ifelse(grepl("08.pct", cormatd4$variable_2), 2008, 0)
cormatd4$year_cand1_1 <- ifelse(grepl("10.pct", cormatd4$variable_2), 2010, 0)
cormatd4$year_cand1_2 <- ifelse(grepl("12.pct", cormatd4$variable_2), 2012, 0)
cormatd4$year_cand1_3 <- ifelse(grepl("16.pct", cormatd4$variable_2), 2016, 0)
cormatd4$year_cand1_4 <- ifelse(grepl("18.pct", cormatd4$variable_2), 2018, 0)
cormatd4$year_cand1_5 <- ifelse(grepl("20.pct", cormatd4$variable_2), 2020, 0)
cormatd4$year_cand1_6 <- ifelse(grepl("22.pct", cormatd4$variable_2), 2022, 0)
cormatd4$year_cand1 <- cormatd4$year_cand1_1 + cormatd4$year_cand1_2 + cormatd4$year_cand1_3 + cormatd4$year_cand1_4 + cormatd4$year_cand1_0 + cormatd4$year_cand1_5 + cormatd4$year_cand1_6
cormatd4$year_cand2_0 <- ifelse(grepl("08.pct", cormatd4$variable), 2008, 0)
cormatd4$year_cand2_1 <- ifelse(grepl("10.pct", cormatd4$variable), 2010, 0)
cormatd4$year_cand2_2 <- ifelse(grepl("12.pct", cormatd4$variable), 2012, 0)
cormatd4$year_cand2_3 <- ifelse(grepl("16.pct", cormatd4$variable), 2016, 0)
cormatd4$year_cand2_4 <- ifelse(grepl("18.pct", cormatd4$variable), 2018, 0)
cormatd4$year_cand2_5 <- ifelse(grepl("20.pct", cormatd4$variable), 2020, 0)
cormatd4$year_cand2_6 <- ifelse(grepl("22.pct", cormatd4$variable), 2022, 0)
cormatd4$year_cand2 <- cormatd4$year_cand2_1 + cormatd4$year_cand2_2 + cormatd4$year_cand2_3 + cormatd4$year_cand2_4 + cormatd4$year_cand2_0 + cormatd4$year_cand2_5 + cormatd4$year_cand2_6
cormatd5 <-cormatd4 %>%
  subset(., select=c(variable_2, year_cand1, variable, year_cand2, value))

cormatd5$value <- as.numeric(cormatd5$value)
cormatd5$variable <- as.character(cormatd5$variable)
cormatd5$party <- "DEM"
cormatd5$state <- 'PA'

dems.grouped<-cormatd5%>%
  mutate(year_cand1=ifelse(variable_2 %in% c("mcginty.16.pct","sestak.16.pct","fetterman.16.pct"), "2016*", as.character(year_cand1)))%>%
  mutate(year_cand2=ifelse(variable %in% c("mcginty.16.pct","sestak.16.pct","fetterman.16.pct"), "2016*", as.character(year_cand2)))%>%
  filter(year_cand1!=year_cand2)%>%
  filter(variable_2!=variable)%>%
  group_by(party,year_cand1,year_cand2)%>%
  dplyr::summarise(cor_mean=mean(value,na.rm=T), 
                   cor_median=median(value,na.rm=T))

write.csv(cormatd5, "pa_dems_groupbyyear_corrs.csv")

#prepare corr summary table
corr.summary<-rbind(dems.grouped, gop.grouped)%>%
  setNames(c("Party","Year 1", "Year 2", "Corr. Mean", "Corr. Median"))%>%
  mutate(Party=Party%>%recode(
    "DEM"="Democrats", 
    "GOP"="Republicans"
  ))

# GE comparison ------ 

### 2008 (Pres.) ----- 
gen.2008 <- read.csv("Unified Even Years/ER 2008 General/2008 General Election Results.txt")
colnames(gen.2008)[3] <- "county.code"
colnames(gen.2008)[4] <- "precinct.num"
colnames(gen.2008)[6] <- "district"
colnames(gen.2008)[9] <- "office.code"
colnames(gen.2008)[10] <- "party.code"
colnames(gen.2008)[12] <- "last.name"
colnames(gen.2008)[13] <- "first.name"
colnames(gen.2008)[16] <- "votes"

gen.2008$unique.precinct <- gen.2008$county.code*10000000+gen.2008$precinct.num
gen.2008.pres <- gen.2008[gen.2008$office.code=="USP",]
#total votes by precinct
total_votes_key <- gen.2008.pres %>%
  group_by(unique.precinct) %>%
  dplyr::summarize(total_votes = sum(votes,na.rm=T))
gen.2008.pres.parties <- gen.2008.pres[gen.2008.pres$party.code=="DEM" | gen.2008.pres$party.code=="REP", ]
gen.2008.pres.parties.sub <- subset(gen.2008.pres.parties,select=c("unique.precinct","last.name","votes"))
gen.2008.pres.parties.sub.wide <- reshape(gen.2008.pres.parties.sub,idvar="unique.precinct",timevar="last.name",direction="wide")

#merge total votes 
gen.2008.pres.parties.sub.wide <- gen.2008.pres.parties.sub.wide %>% 
  left_join(total_votes_key, by=c("unique.precinct"))

#extract voteshares
gen.2008.pres.parties.sub.wide$obama.gen.08.pct <- gen.2008.pres.parties.sub.wide$votes.OBAMA / gen.2008.pres.parties.sub.wide$total_votes
gen.2008.pres.parties.sub.wide$mccain.gen.08.pct <- gen.2008.pres.parties.sub.wide$votes.MCCAIN / gen.2008.pres.parties.sub.wide$total_votes

### 2012 (Pres.) ------
gen.2012 <- read.csv("Unified Even Years/ER 2012 General/ErStat_2012_G/ERStat_2012_G(205268).txt")
colnames(gen.2012)[3] <- "county.code"
colnames(gen.2012)[4] <- "precinct.num"
colnames(gen.2012)[6] <- "district"
colnames(gen.2012)[9] <- "office.code"
colnames(gen.2012)[10] <- "party.code"
colnames(gen.2012)[12] <- "last.name"
colnames(gen.2012)[13] <- "first.name"
colnames(gen.2012)[16] <- "votes"

gen.2012$unique.precinct <- gen.2012$county.code*10000000+gen.2012$precinct.num
gen.2012.pres <- gen.2012[gen.2012$office.code=="USP",]
#total votes by precinct
total_votes_key <- gen.2012.pres %>%
  group_by(unique.precinct) %>%
  dplyr::summarize(total_votes = sum(votes,na.rm=T))
gen.2012.pres.parties <- gen.2012.pres[gen.2012.pres$party.code=="DEM" | gen.2012.pres$party.code=="REP", ]
gen.2012.pres.parties.sub <- subset(gen.2012.pres.parties,select=c("unique.precinct","last.name","votes"))
gen.2012.pres.parties.sub.wide <- reshape(gen.2012.pres.parties.sub,idvar="unique.precinct",timevar="last.name",direction="wide")

#merge total votes 
gen.2012.pres.parties.sub.wide <- gen.2012.pres.parties.sub.wide %>% 
  left_join(total_votes_key, by=c("unique.precinct"))

#extract voteshares
gen.2012.pres.parties.sub.wide$obama.gen.12.pct <- gen.2012.pres.parties.sub.wide$votes.OBAMA / gen.2012.pres.parties.sub.wide$total_votes
gen.2012.pres.parties.sub.wide$romney.gen.12.pct <- gen.2012.pres.parties.sub.wide$votes.ROMNEY / gen.2012.pres.parties.sub.wide$total_votes

### 2016 (Pres.) ------
gen.2016 <- read.csv("Unified Even Years/ER 2016 General/ERStat_2016_G(204013)_20170406.txt")
colnames(gen.2016)[3] <- "county.code"
colnames(gen.2016)[4] <- "precinct.num"
colnames(gen.2016)[6] <- "district"
colnames(gen.2016)[9] <- "office.code"
colnames(gen.2016)[10] <- "party.code"
colnames(gen.2016)[12] <- "last.name"
colnames(gen.2016)[13] <- "first.name"
colnames(gen.2016)[16] <- "votes"

gen.2016$unique.precinct <- gen.2016$county.code*10000000+gen.2016$precinct.num
#subset office
gen.2016.pres <- gen.2016[gen.2016$office.code=="USP",]
#total votes by precinct
total_votes_key <- gen.2016.pres %>%
  group_by(unique.precinct) %>%
  dplyr::summarize(total_votes = sum(votes,na.rm=T))
gen.2016.pres.parties <- gen.2016.pres[gen.2016.pres$party.code=="DEM" | gen.2016.pres$party.code=="REP", ]
gen.2016.pres.parties.sub <- subset(gen.2016.pres.parties,select=c("unique.precinct","last.name","votes"))
gen.2016.pres.parties.sub.wide <- reshape(gen.2016.pres.parties.sub,idvar="unique.precinct",timevar="last.name",direction="wide")

#merge total votes 
gen.2016.pres.parties.sub.wide <- gen.2016.pres.parties.sub.wide %>% 
  left_join(total_votes_key, by=c("unique.precinct"))

#extract voteshares
gen.2016.pres.parties.sub.wide$clinton.gen.16.pct <- gen.2016.pres.parties.sub.wide$votes.CLINTON / gen.2016.pres.parties.sub.wide$total_votes
gen.2016.pres.parties.sub.wide$trump.gen.16.pct <- gen.2016.pres.parties.sub.wide$votes.TRUMP / gen.2016.pres.parties.sub.wide$total_votes

### 2020 (Pres.) -------
gen.2020 <- read.csv("Unified Even Years/ER 2020 General/ERStat_2020_G(179551)_20210331.txt")
colnames(gen.2020)[3] <- "county.code"
colnames(gen.2020)[4] <- "precinct.num"
colnames(gen.2020)[6] <- "district"
colnames(gen.2020)[9] <- "office.code"
colnames(gen.2020)[10] <- "party.code"
colnames(gen.2020)[12] <- "last.name"
colnames(gen.2020)[13] <- "first.name"
colnames(gen.2020)[16] <- "votes"

gen.2020$unique.precinct <- gen.2020$county.code*10000000+gen.2020$precinct.num
#subset office
gen.2020.pres <- gen.2020[gen.2020$office.code=="USP",]
#total votes by precinct
total_votes_key <- gen.2020.pres %>%
  group_by(unique.precinct) %>%
  dplyr::summarize(total_votes = sum(votes,na.rm=T))
gen.2020.pres.parties <- gen.2020.pres[gen.2020.pres$party.code=="DEM" | gen.2020.pres$party.code=="REP", ]
gen.2020.pres.parties.sub <- subset(gen.2020.pres.parties,select=c("unique.precinct","last.name","votes"))
gen.2020.pres.parties.sub.wide <- reshape(gen.2020.pres.parties.sub,idvar="unique.precinct",timevar="last.name",direction="wide")

#merge total votes 
gen.2020.pres.parties.sub.wide <- gen.2020.pres.parties.sub.wide %>% 
  left_join(total_votes_key, by=c("unique.precinct"))

#extract voteshares
gen.2020.pres.parties.sub.wide$biden.gen.20.pct <- gen.2020.pres.parties.sub.wide$votes.BIDEN/ gen.2020.pres.parties.sub.wide$total_votes
gen.2020.pres.parties.sub.wide$trump.gen.20.pct <- gen.2020.pres.parties.sub.wide$votes.TRUMP / gen.2020.pres.parties.sub.wide$total_votes

### 2010 (Senate) ------
gen.2010 <- read.csv("Unified Even Years/ER 2010 General/prcres.txt")
colnames(gen.2010)[3] <- "county.code"
colnames(gen.2010)[4] <- "precinct.num"
colnames(gen.2010)[6] <- "district"
colnames(gen.2010)[9] <- "office.code"
colnames(gen.2010)[10] <- "party.code"
colnames(gen.2010)[12] <- "last.name"
colnames(gen.2010)[13] <- "first.name"
colnames(gen.2010)[16] <- "votes"

gen.2010$unique.precinct <- gen.2010$county.code*10000000+gen.2010$precinct.num
#subset office
gen.2010.sen <- gen.2010[gen.2010$office.code=="USS",]

#total votes by precinct
total_votes_key <- gen.2010.sen %>%
  group_by(unique.precinct) %>%
  dplyr::summarize(total_votes = sum(votes,na.rm=T))
gen.2010.sen.parties <- gen.2010.sen[gen.2010.sen$party.code=="DEM" | gen.2010.sen$party.code=="REP", ]
gen.2010.sen.parties.sub <- subset(gen.2010.sen.parties,select=c("unique.precinct","last.name","votes"))
gen.2010.sen.parties.sub.wide <- reshape(gen.2010.sen.parties.sub,idvar="unique.precinct",timevar="last.name",direction="wide")

#merge total votes 
gen.2010.sen.parties.sub.wide <- gen.2010.sen.parties.sub.wide %>% 
  left_join(total_votes_key, by=c("unique.precinct"))

gen.2010.sen.parties.sub.wide$sestak.gen.10.pct <- gen.2010.sen.parties.sub.wide$votes.SESTAK / gen.2010.sen.parties.sub.wide$total_votes
gen.2010.sen.parties.sub.wide$toomey.gen.10.pct <- gen.2010.sen.parties.sub.wide$votes.TOOMEY / gen.2010.sen.parties.sub.wide$total_votes

# 2016 (Senate) ------ 
gen.2016 <- read.csv("Unified Even Years/ER 2016 General/ERStat_2016_G(204013)_20170406.txt")
colnames(gen.2016)[3] <- "county.code"
colnames(gen.2016)[4] <- "precinct.num"
colnames(gen.2016)[6] <- "district"
colnames(gen.2016)[9] <- "office.code"
colnames(gen.2016)[10] <- "party.code"
colnames(gen.2016)[12] <- "last.name"
colnames(gen.2016)[13] <- "first.name"
colnames(gen.2016)[16] <- "votes"

gen.2016$unique.precinct <- gen.2016$county.code*10000000+gen.2016$precinct.num
#subset office
gen.2016.sen <- gen.2016[gen.2016$office.code=="USS",]

#total votes by precinct
total_votes_key <- gen.2016.sen %>%
  group_by(unique.precinct) %>%
  dplyr::summarize(total_votes = sum(votes,na.rm=T))
gen.2016.sen.parties <- gen.2016.sen[gen.2016.sen$party.code=="DEM" | gen.2016.sen$party.code=="REP", ]
gen.2016.sen.parties.sub <- subset(gen.2016.sen.parties,select=c("unique.precinct","last.name","votes"))
gen.2016.sen.parties.sub.wide <- reshape(gen.2016.sen.parties.sub,idvar="unique.precinct",timevar="last.name",direction="wide")

#merge total votes 
gen.2016.sen.parties.sub.wide <- gen.2016.sen.parties.sub.wide %>% 
  left_join(total_votes_key, by=c("unique.precinct"))
gen.2016.sen.parties.sub.wide$mcginty.gen.16.pct <- gen.2016.sen.parties.sub.wide$votes.MCGINTY / gen.2016.sen.parties.sub.wide$total_votes
gen.2016.sen.parties.sub.wide$toomey.gen.16.pct <- gen.2016.sen.parties.sub.wide$votes.TOOMEY / gen.2016.sen.parties.sub.wide$total_votes


# 2018 (Senate) ------ 
gen.2018 <- read.csv("Unified Even Years/ER 2018 General/ERStat_2018_G(114234)_20190726.txt")
colnames(gen.2018)[3] <- "county.code"
colnames(gen.2018)[4] <- "precinct.num"
colnames(gen.2018)[6] <- "district"
colnames(gen.2018)[9] <- "office.code"
colnames(gen.2018)[10] <- "party.code"
colnames(gen.2018)[12] <- "last.name"
colnames(gen.2018)[13] <- "first.name"
colnames(gen.2018)[16] <- "votes"

gen.2018$unique.precinct <- gen.2018$county.code*10000000+gen.2018$precinct.num
#subset office
gen.2018.sen <- gen.2018[gen.2018$office.code=="USS",]

#total votes by precinct
total_votes_key <- gen.2018.sen %>%
  group_by(unique.precinct) %>%
  dplyr::summarize(total_votes = sum(votes,na.rm=T))
gen.2018.sen.parties <- gen.2018.sen[gen.2018.sen$party.code=="DEM" | gen.2018.sen$party.code=="REP", ]
gen.2018.sen.parties.sub <- subset(gen.2018.sen.parties,select=c("unique.precinct","last.name","votes"))
gen.2018.sen.parties.sub.wide <- reshape(gen.2018.sen.parties.sub,idvar="unique.precinct",timevar="last.name",direction="wide")

#merge total votes 
gen.2018.sen.parties.sub.wide <- gen.2018.sen.parties.sub.wide %>% 
  left_join(total_votes_key, by=c("unique.precinct"))

gen.2018.sen.parties.sub.wide$casey.gen.18.pct <- gen.2018.sen.parties.sub.wide$votes.CASEY / gen.2018.sen.parties.sub.wide$total_votes
gen.2018.sen.parties.sub.wide$barletta.gen.18.pct <- gen.2018.sen.parties.sub.wide$votes.BARLETTA / gen.2018.sen.parties.sub.wide$total_votes

gen.merged <- gen.2008.pres.parties.sub.wide %>%
  full_join(gen.2012.pres.parties.sub.wide, by="unique.precinct") %>% 
  full_join(gen.2016.pres.parties.sub.wide, by="unique.precinct") %>% 
  full_join(gen.2020.pres.parties.sub.wide, by="unique.precinct") %>%
  full_join(gen.2010.sen.parties.sub.wide, by="unique.precinct") %>%
  full_join(gen.2016.sen.parties.sub.wide, by="unique.precinct") %>%
  full_join(gen.2018.sen.parties.sub.wide, by="unique.precinct") %>%
  dplyr::select(-unique.precinct) %>%
  subset(., select = c(obama.gen.08.pct, sestak.gen.10.pct, obama.gen.12.pct, 
                       clinton.gen.16.pct, mcginty.gen.16.pct, casey.gen.18.pct, biden.gen.20.pct, 
                       mccain.gen.08.pct, toomey.gen.10.pct, romney.gen.12.pct,
                       trump.gen.16.pct, toomey.gen.16.pct, barletta.gen.18.pct, trump.gen.20.pct))

gen.merged.dem <- gen.merged[,1:7] #democratic candidates
gen.merged.gop <- gen.merged[,8:14] #republican candidates

cormatd.gop <- cor(gen.merged.gop,use="pairwise.complete.obs")
cormatd.gop2 <- cormatd.gop[,-c(1)]

cormatd.dem <- cor(gen.merged.dem,use="pairwise.complete.obs")
cormatd.dem2 <- cormatd.dem[,-c(1)]

precinct.num(gen.merged.gop)
precinct.num(gen.merged.dem)

print(xtable(cormatd.gop2, 
             label="f:tab:gen_election_corrs_PA_gop", 
             caption=c("Over-time correlations between Republican nominee vote shares from 2008 to 2020 in presidential and Senate races in PA (n=8992-9211)."),
             digits=c(0,rep(3,6))),
      file=c("tableA20.tex"))#table A20

tableA20 <- readLines("tableA20.tex")
tableA20 <- gsub("mccain.gen.08.pct", "McCain '08", tableA20)
tableA20 <- gsub("toomey.gen.10.pct", "Toomey '10", tableA20)
tableA20 <- gsub("romney.gen.12.pct", "Romney '12", tableA20)
tableA20 <- gsub("trump.gen.16.pct", "Trump '16", tableA20)
tableA20 <- gsub("toomey.gen.16.pct", "Toomey '16", tableA20)
tableA20 <- gsub("barletta.gen.18.pct", "Barletta '18", tableA20)
tableA20 <- gsub("trump.gen.20.pct", "Trump '20", tableA20)
writeLines(tableA20, "tableA20.tex")

print(xtable(cormatd.dem2, 
             label="tab:gen_election_corrs_PA_dems", 
             caption=c("Over-time correlations between Democratic nominee vote shares from 2008 to 2020 in presidential and Senate races in PA (n=8991-9211)."),
             digits=c(0,rep(3,6))),
      file=c("tableA21.tex"))#table A21

tableA21 <- readLines("tableA21.tex")
tableA21 <- gsub("obama.gen.08.pct", "Obama '08", tableA21)
tableA21 <- gsub("sestak.gen.10.pct", "Sestak '10", tableA21)
tableA21 <- gsub("obama.gen.12.pct", "Obama '12", tableA21)
tableA21 <- gsub("clinton.gen.16.pct", "Clinton '16", tableA21)
tableA21 <- gsub("mcginty.gen.16.pct", "McGinty '16", tableA21)
tableA21 <- gsub("casey.gen.18.pct", "Casey '18", tableA21)
tableA21 <- gsub("biden.gen.20.pct", "Biden '20", tableA21)
writeLines(tableA21, "tableA21.tex")

# Assess precinct overlap across elections (how many are we able to match?) ------ 

### precinct vote sums, primaries
precinct.counts <- function(dta,vote.col.name){
  return(dta%>%
    group_by(unique.precinct) %>%
    dplyr::summarise(votes = sum(as.numeric(votes)))%>%
      setNames(c("unique.precinct", vote.col.name))
    )
}

#pres (dem)
dta.pres.dem.08.sub.precinct.counts <- precinct.counts(dta.pres.dem.08.sub, "votes.08.pres")
dta.pres.dem.16.sub.precinct.counts <- precinct.counts(dta.pres.dem.16.sub, "votes.16.pres")
dta.pres.dem.20.sub.precinct.counts <- precinct.counts(dta.pres.dem.20.sub, "votes.20.pres")

#sen (dem)
dta.sen.dem.10.sub.precinct.counts <- precinct.counts(dta.sen.dem.10.sub, "votes.10.sen")
dta.sen.dem.16.sub.precinct.counts <- precinct.counts(dta.sen.dem.16.sub, "votes.16.sen")
dta.sen.dem.22.sub.precinct.counts <- precinct.counts(dta.sen.dem.22.sub, "votes.22.sen")

#merge
precinct.count.merged <- dta.pres.dem.08.sub.precinct.counts %>%
  full_join(dta.pres.dem.16.sub.precinct.counts, by = "unique.precinct") %>%
  full_join(dta.pres.dem.20.sub.precinct.counts, by = "unique.precinct") %>%
  full_join(dta.sen.dem.10.sub.precinct.counts, by = "unique.precinct") %>%
  full_join(dta.sen.dem.16.sub.precinct.counts, by = "unique.precinct") %>%
  full_join(dta.sen.dem.22.sub.precinct.counts, by = "unique.precinct")
  
#assess overlap 
frac.complete <- function(df, col1, col2) {
  df.tr<-df %>% 
    .[, c("unique.precinct",col1, col2)]%>% 
    setNames(c("V1", "V2", "V3")) %>% #get around "Can't transform a data frame with duplicate names."
    filter(!(is.na(V2)&is.na(V3)))
  return(sum(complete.cases(df.tr[, c("V2", "V3")])) / nrow(df.tr))
} #note that the denominator here is all precinct codes that appear in EITHER columns

precoverlap <- data.frame()

vote.cols<-colnames(precinct.count.merged)[-1]
for (i in 1:(length(vote.cols)-1)) {
  for (j in (i+1):length(vote.cols)) {
    fraction <- round(frac.complete(precinct.count.merged, vote.cols[i], vote.cols[j]),2)
    precoverlap <- rbind(precoverlap, data.frame(race1 = vote.cols[i], 
                                                 race2 = vote.cols[j], 
                                                 fraction.overlap = fraction))
  }
}

precoverlap<-precoverlap%>%
  distinct()%>%
  mutate(race1=race1%>%recode(
    "votes.08.pres"="President, 2008",
    "votes.16.pres"="President, 2016", 
    "votes.20.pres"="President, 2020", 
    "votes.10.sen"="Senate, 2010", 
    "votes.22.sen"="Senate, 2016", 
    "votes.16.sen"="Senate, 2022"
  ))%>% 
  mutate(race2=race2%>%recode(
    "votes.08.pres"="President, 2008",
    "votes.16.pres"="President, 2016", 
    "votes.20.pres"="President, 2020", 
    "votes.10.sen"="Senate, 2010", 
    "votes.22.sen"="Senate, 2016", 
    "votes.16.sen"="Senate, 2022"
  ))%>% 
  setNames(c("Campaign 1", "Campaign 2", "Precinct Overlap"))

print(xtable(precoverlap,
             digits=c(0,0,3,3),
             label="t:pa:precinct:overlap",
             caption="Fraction of precincts that overlap in Democratic primaries in PA across campaigns. The analyses include only precincts that we are able to match across elections."), 
      include.rownames=F,
      file="tableA22.tex")#table A22
